library(cowplot) # plot_grid()
library(dotenv) # load_dot_env()
library(dplyr) # left_join()
library(ggplot2) # ggplot()
library(gridExtra) # grid.arrange()
library(Matrix) # rowSums()
library(parallel) # detectCores()
library(rtracklayer) # import()
library(scCustomize) # Read_CellBender_h5_Mat()
library(Seurat) # Read10X_h5()
library(stringr) # str_match()
# user variables
out <- "../../results/pass_1/all_clusters/"
sample_order <- c("E3.M","E3.F","E4.M","E4.F")
sample_colors <- c("#26946A","#1814A1","#EAC941","#DF5F00")
sample_order2 <- c("Male E3", "Male E4", "Female E3", "Female E4")
isoform_order <- c("E4","E3")
isoform_colors <- c("darkgray","cornflowerblue")
sex_order <- c("Male","Female")
sex_colors <- c("darkgray","purple")
# environment variables
load_dot_env(file = "../../refs/.env")
counts_dir <- Sys.getenv("COUNTS_DIR")
# thresholds
nCount.min <- 500
nCount.max <- 20000
nFeature.min <- 250
complexity.cutoff <- 0.8
mt.cutoff <- 20
ribo.cutoff <- 50
hb.cutoff <- 3
# functions
source("../../../functions/Kennedi_single_cell_functions.R")
saveToPDF <- function(...) {
d = dev.copy(pdf,...)
dev.off(d)
}
if (file.exists("../../rObjects/cellbender_h5.rds")) {
mouse <- readRDS("../../rObjects/cellbender_h5.rds")
} else {
# individual sample objects
E3.F <- CreateSeuratObject(
scCustomize::Read_CellBender_h5_Mat(paste0(counts_dir, "/E3_14M_F/outs/E3_14M_F_cellbender_filtered.h5")))
E3.M <- CreateSeuratObject(
scCustomize::Read_CellBender_h5_Mat(paste0(counts_dir, "/E3_14M_M/outs/E3_14M_M_cellbender_filtered.h5")))
E4.F <- CreateSeuratObject(
scCustomize::Read_CellBender_h5_Mat(paste0(counts_dir, "/E4_14M_F/outs/E4_14M_F_cellbender_filtered.h5")))
E4.M <- CreateSeuratObject(
scCustomize::Read_CellBender_h5_Mat(paste0(counts_dir, "/E4_14M_M/outs/E4_14M_M_cellbender_filtered.h5")))
# merge objects
mouse <- merge(x = E3.F,
y = c(E3.M,E4.F,E4.M),
add.cell.ids = c("E3.F","E3.M","E4.F","E4.M"),
project = "Mouse Meningeal Dura scRNAseq")
# cleanup and save
remove(E3.F, E3.M, E4.F, E4.M)
saveRDS(mouse, "../../rObjects/raw_h5.rds")
}
# join layers
mouse <- JoinLayers(mouse)
# preview
mouse
## An object of class Seurat
## 32285 features across 37976 samples within 1 assay
## Active assay: RNA (32285 features, 0 variable features)
## 1 layer present: counts
# read in annotation file, GENCODE GRCm38 version M23 (Ensembl 98)
if (file.exists("../../rObjects/annotation.rds")) {
genes <- readRDS("../../rObjects/annotation.rds")
} else {
gtf.file <- "../../refs/mouse_genes.gtf"
genes <- rtracklayer::import(gtf.file)
genes <- as.data.frame(genes)
genes <- genes[genes$type == "gene",]
saveRDS(genes, "../../rObjects/annotation.rds")
}
# create sample column
barcodes <- colnames(mouse)
pattern <- "(.+)_[ACGT]+-(\\d+)"
sample <- str_match(barcodes, pattern)[,2]
table(sample) # check
## sample
## E3.F E3.M E4.F E4.M
## 7589 9690 12541 8156
mouse$sample <- factor(sample, levels = sample_order)
Idents(mouse) <- mouse$sample
# age column
mouse$age <- "14 months"
# sex column
sex <- str_match(mouse$sample, "E\\d.([FM])")[,2]
sex <- gsub("F","Female",sex)
sex <- gsub("M","Male",sex)
mouse$sex <- factor(sex, levels = sex_order)
# Apoe isoform column
isoform <- str_match(mouse$sample, "(E\\d).[FM]")[,2]
mouse$isoform <- factor(isoform, levels = isoform_order)
# sample2 column (for title formatting)
mouse$sample2 <- paste(mouse$sex, mouse$isoform, sep = " ")
# cell.complexity
mouse$cell.complexity <- log10(mouse$nFeature_RNA) / log10(mouse$nCount_RNA)
# percent.mt
mt.genes <- genes[genes$seqnames == "chrM",13]
mouse$percent.mt <- PercentageFeatureSet(mouse, features = mt.genes)
mt.genes
## [1] "mt-Nd1" "mt-Nd2" "mt-Co1" "mt-Co2" "mt-Atp8" "mt-Atp6" "mt-Co3"
## [8] "mt-Nd3" "mt-Nd4l" "mt-Nd4" "mt-Nd5" "mt-Nd6" "mt-Cytb"
# percent.ribo
# ribosomal proteins begin with 'Rps' or 'Rpl' in this annotation file
# mitochondrial ribosomes start with 'Mrps' or 'Mrpl' (not on MT chromosome)
gene.names <- genes$gene_name
ribo <- gene.names[grep("^Rp[sl]", gene.names)]
mt.ribo <- gene.names[grep("^Mrp[sl]", gene.names)]
ribo.combined <- c(mt.ribo,ribo)
mouse$percent.ribo.protein <- PercentageFeatureSet(mouse, features = ribo.combined)
ribo.combined
## [1] "Mrpl15" "Mrpl30" "Mrps9" "Mrpl44" "Mrps14"
## [6] "Mrpl41" "Mrps2" "Mrps5" "Mrps26" "Mrps28"
## [11] "Mrpl47" "Mrpl24" "Mrpl9" "Mrps21" "Mrpl50"
## [16] "Mrpl37" "Mrps15" "Mrpl20" "Mrpl33" "Mrpl1"
## [21] "Mrps18c" "Mrps17" "Mrps33" "Mrpl35" "Mrpl19"
## [26] "Mrpl53" "Mrps25" "Mrpl51" "Mrps35" "Mrps12"
## [31] "Mrpl46" "Mrps11" "Mrpl48" "Mrpl17" "Mrpl23"
## [36] "Mrps31" "Mrpl34" "Mrpl4" "Mrps22" "Mrpl3"
## [41] "Mrpl54" "Mrpl42" "Mrps24" "Mrpl22" "Mrpl55"
## [46] "Mrps23" "Mrpl27" "Mrpl10" "Mrpl45" "Mrpl58"
## [51] "Mrps7" "Mrpl38" "Mrpl12" "Mrpl32" "Mrpl36"
## [56] "Mrps27" "Mrps36" "Mrps30" "Mrps16" "Mrpl52"
## [61] "Mrpl57" "Mrpl13" "Mrpl40" "Mrpl39" "Mrps6"
## [66] "Mrpl18" "Mrps34" "Mrpl28" "Mrps18b" "Mrpl14"
## [71] "Mrps18a" "Mrpl2" "Mrps10" "Mrpl21" "Mrpl11"
## [76] "Mrpl49" "Mrpl16" "Mrpl43" "Rpl7" "Rpl31"
## [81] "Rpl37a" "Rps6kc1" "Rpl7a" "Rpl12" "Rpl35"
## [86] "Rps21" "Rpl22l1" "Rps3a1" "Rps27" "Rpl34"
## [91] "Rps20" "Rps6" "Rps8" "Rps6ka1" "Rpl11"
## [96] "Rpl22" "Rpl9" "Rpl5" "Rplp0" "Rpl6"
## [101] "Rpl21" "Rpl32" "Rps9" "Rpl28" "Rps5"
## [106] "Rps19" "Rps16" "Rps11" "Rpl13a" "Rpl18"
## [111] "Rps17" "Rps3" "Rpl27a" "Rps13" "Rps15a"
## [116] "Rplp2" "Rpl18a" "Rpl13" "Rps25" "Rpl10-ps3"
## [121] "Rplp1" "Rpl4" "Rps27l" "Rpl29" "Rps27rt"
## [126] "Rpsa" "Rpl14" "Rps12" "Rps15" "Rpl41"
## [131] "Rps26" "Rps27a" "Rpl26" "Rpl23a" "Rpl9-ps1"
## [136] "Rps6kb1" "Rpl23" "Rpl19" "Rpl27" "Rpl38"
## [141] "Rps7" "Rpl10l" "Rps29" "Rpl36al" "Rps6kl1"
## [146] "Rps6ka5" "Rps23" "Rpl15" "Rps24" "Rpl36a-ps1"
## [151] "Rpl37" "Rpl30" "Rpl8" "Rpl3" "Rps19bp1"
## [156] "Rpl39l" "Rpl35a" "Rpl24" "Rps6ka2" "Rps2"
## [161] "Rpl3l" "Rps10" "Rpl10a" "Rps28" "Rps18"
## [166] "Rpl7l1" "Rpl36" "Rpl36-ps4" "Rps14" "Rpl17"
## [171] "Rps6kb2" "Rps6ka4" "Rpl9-ps6" "Rpl39" "Rpl10"
## [176] "Rps4x" "Rps6ka6" "Rpl36a" "Rps6ka3"
# percent.hb - hemoglobin proteins begin with 'Hbb' or 'Hba' for mouse
hb.genes <- gene.names[grep("^Hb[ba]-", gene.names)]
mouse$percent.hb <- PercentageFeatureSet(mouse, features = hb.genes)
hb.genes
## [1] "Hbb-bt" "Hbb-bs" "Hbb-bh2" "Hbb-bh1" "Hbb-y" "Hba-x" "Hba-a1"
## [8] "Hba-a2"
# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse$sample))
colnames(data) <- c("sample","frequency")
ncells1 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
scale_fill_manual(values = sample_colors) +
scale_y_continuous(breaks = seq(0,15000, by = 5000), limits = c(0,15000)) +
ggtitle("Raw: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
ncells1
# Visualize nCount_RNA
den1 <- ggplot(mouse@meta.data,
aes(color = sample,
x = nCount_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("nCount_RNA") +
ylab("Density") +
theme(legend.position = "none") +
geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
geom_vline(xintercept = nCount.max, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize nFeature_RNA
den2 <- ggplot(mouse@meta.data,
aes(color = sample,
x = nFeature_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("nFeature_RNA") +
ylab("Density") +
geom_vline(xintercept = nFeature.min, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse@meta.data,
aes(color = sample,
x = cell.complexity,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("Cell Complexity (log10(nFeature/nCount))") +
ylab("Density") +
geom_vline(xintercept = complexity.cutoff, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.mt
den4 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff, linetype = "dashed", colour = "red") +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.ribo.protein
den5 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.ribo.protein,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Ribosomal Protein Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.hb
den6 <- ggplot(mouse@meta.data,
aes(color = sample,
x = percent.hb,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = hb.cutoff, linetype = "dashed", colour = "red") +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Hemoglobin Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# normalize data for violin plots
mouse <- NormalizeData(mouse, assay = "RNA")
## Normalizing layer: counts
# nFeature, nCount, and cell.complexity violins
v1 <- VlnPlot(mouse,
features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v1
# percent violins
v2 <- VlnPlot(mouse,
features = c("percent.mt","percent.ribo.protein","percent.hb"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v2
s1 <- ggplot(
mouse@meta.data,
aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
geom_hline(yintercept = nFeature.min, linetype = "dashed", colour = "red") +
facet_wrap(~sample) +
scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s1
## `geom_smooth()` using formula = 'y ~ x'
s2 <- FeatureScatter(mouse,
feature1 = "nCount_RNA",
feature2 = "percent.mt",
group.by = 'sample',
cols = sample_colors,
shuffle = TRUE)
s2
# filter
mouse.filtered <- subset(mouse,
subset = (nCount_RNA > nCount.min) &
(nCount_RNA < nCount.max) &
(nFeature_RNA > nFeature.min) &
(cell.complexity > complexity.cutoff) &
(percent.mt < mt.cutoff) &
(percent.hb < hb.cutoff))
# print cells removed
print(paste0(dim(mouse)[2] - dim(mouse.filtered)[2]," cells removed"))
## [1] "21011 cells removed"
Remove lowly expressed genes. We will keep genes that have at least 1 count in 10 cells.
# filter genes
mouse.filtered <- JoinLayers(mouse.filtered)
counts <- GetAssayData(object = mouse.filtered, layer = "counts")
nonzero <- counts > 0 # produces logical
keep <- Matrix::rowSums(nonzero) >= 10 # sum the true/false
counts.filtered <- counts[keep,] # keep certain genes
# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered,
meta.data = mouse.filtered@meta.data)
# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "10916 features removed"
Remove highly abundant transcripts.
# remove mt.genes
counts <- GetAssayData(object = mouse.filtered, layer = "counts")
keep <- !rownames(counts) %in% mt.genes # false when mt.gene
counts.filtered <- counts[keep,]
# remove ribo.genes
keep <- !rownames(counts.filtered) %in% ribo.combined
counts.filtered <- counts.filtered[keep,]
# remove Ig genes + Jchain but keep Igha + Ighd to enahnce clustering
gene.types <- c("IG_C_gene","IG_C_pseudogene","IG_D","IG_J_gene","IG_LV_gene",
"IG_V_gene","IG_V_pseudogene")
keep <- (genes$gene_type) %in% gene.types
ig.genes <- genes[keep,]
ig.genes <- c(ig.genes$gene_name, "Jchain")
ig.genes <- ig.genes[-c(185,192)] # keep Igha and Ighd
ig.genes <- ig.genes[!ig.genes %in% c("Igha","Ighd")]
ig.genes
## [1] "Gm17472" "Gm20730" "Igkv2-137" "Igkv1-136" "Igkv1-135"
## [6] "Igkv14-134-1" "Igkv17-134" "Igkv1-133" "Gm43284" "Igkv1-132"
## [11] "Igkv1-131" "Igkv14-130" "Igkv9-129" "Igkv9-128" "Igkv17-127"
## [16] "Igkv14-126-1" "Igkv14-126" "Igkv11-125" "Igkv9-124" "Igkv9-123"
## [21] "Igkv1-122" "Igkv17-121" "Igkv9-120" "Igkv9-119" "Igkv14-118-2"
## [26] "Igkv14-118-1" "Gm43739" "Igkv11-118" "Igkv1-117" "Igkv2-116"
## [31] "Igkv1-115" "Igkv11-114" "Igkv2-113" "Igkv2-112" "Igkv14-111"
## [36] "Igkv1-110" "Igkv2-109" "Igkv1-108" "Igkv2-107" "Igkv11-106"
## [41] "Igkv2-105" "Igkv16-104" "Igkv15-103" "Igkv15-102" "Igkv20-101-2"
## [46] "Igkv15-101-1" "Igkv15-101" "Igkv14-100" "Igkv1-99" "Igkv12-98"
## [51] "Igkv15-97" "Igkv10-96" "Igkv2-95-2" "Igkv2-95-1" "Igkv10-95"
## [56] "Igkv10-94" "Igkv2-93-1" "Igkv19-93" "Igkv4-92" "Gm42925"
## [61] "Igkv4-91" "Igkv4-90" "Gm42924" "Igkv12-89" "Igkv1-88"
## [66] "Gm42539" "Gm42543" "Gm42542" "Igkv13-87" "Igkv4-86"
## [71] "Igkv13-85" "Igkv13-84" "Igkv4-83" "Igkv13-82" "Igkv4-81"
## [76] "Igkv13-80-1" "Igkv4-80" "Igkv4-79" "Igkv13-78-1" "Igkv4-78"
## [81] "Gm43583" "Igkv4-77" "Igkv13-76" "Igkv4-75" "Igkv13-74-1"
## [86] "Igkv4-74" "Igkv13-73-1" "Igkv4-73" "Igkv4-72" "Igkv13-71-1"
## [91] "Igkv4-71" "Igkv4-70" "Igkv4-69" "Igkv4-68" "Igkv12-67"
## [96] "Igkv12-66" "Igkv4-65" "Igkv13-64" "Igkv4-63" "Igkv13-62-1"
## [101] "Igkv4-62" "Igkv13-61-1" "Igkv4-61" "Igkv4-59" "Igkv4-60"
## [106] "Igkv4-58" "Igkv13-57-2" "Igkv4-57-1" "Igkv13-57-1" "Igkv4-57"
## [111] "Igkv13-56-1" "Igkv4-56" "Igkv13-55-1" "Igkv4-55" "Igkv13-54-1"
## [116] "Igkv4-54" "Igkv4-53" "Gm42606" "Igkv4-51" "Gm42958"
## [121] "Igkv4-50" "Igkv12-49" "Igkv5-48" "Igkv12-47" "Igkv12-46"
## [126] "Igkv5-45" "Igkv12-44" "Igkv5-43" "Igkv12-42" "Igkv12-41"
## [131] "Igkv5-40-1" "Igkv12-40" "Igkv5-39" "Gm43220" "Igkv12-38"
## [136] "Igkv5-37" "Igkv18-36" "Igkv1-35" "Gm43586" "Igkv8-34"
## [141] "Igkv7-33" "Igkv6-32" "Igkv8-31" "Igkv8-30" "Igkv6-29"
## [146] "Igkv8-28" "Igkv8-27" "Igkv8-26" "Igkv6-25" "Igkv8-24"
## [151] "Gm43218" "Igkv6-23" "Igkv8-22" "Igkv8-21" "Igkv6-20"
## [156] "Igkv8-19" "Igkv8-18" "Igkv6-17" "Igkv8-16" "Gm42667"
## [161] "Igkv6-15" "Gm10360" "Igkv6-14" "Igkv6-13" "Gm42720"
## [166] "Igkv3-12-1" "Igkv3-12" "Igkv3-11" "Igkv3-10" "Igkv3-9"
## [171] "Igkv3-8" "Igkv3-7" "Igkv3-6" "Igkv3-5" "Igkv3-4"
## [176] "Igkv3-3" "Igkv3-2" "Igkv3-1" "Igkj1" "Igkj2"
## [181] "Igkj3" "Igkj4" "Igkj5" "Igkc" "Ighe"
## [186] "Ighg2c" "Ighg2b" "Gm37944" "Ighg1" "Ighg3"
## [191] "Ighm" "Ighj4" "Ighj3" "Ighj2" "Ighj1"
## [196] "Ighv5-1" "Ighv2-1" "Ighv5-2" "Ighv2-2" "Ighv5-3"
## [201] "Ighv5-4" "Ighv6-1" "Ighv2-3" "Ighv5-5" "Ighv5-6"
## [206] "Ighv5-7" "Ighv2-4" "Ighv5-8" "Ighv5-8" "Ighv5-9"
## [211] "Ighv5-10" "Ighv2-5" "Ighv5-11" "Ighv5-12" "Ighv2-6"
## [216] "Gm37976" "Gm37418" "Ighv5-9-1" "Gm38203" "Ighv5-12-4"
## [221] "Gm38205" "Ighv2-9-1" "Gm37722" "Ighv5-13" "Ighv2-6-8"
## [226] "Gm7003" "Ighv2-7" "Ighv5-15" "Ighv5-16" "Ighv5-17"
## [231] "Ighv5-18" "Ighv2-8" "Ighv2-9" "Ighv5-19" "Ighv7-1"
## [236] "Ighv7-2" "Ighv14-1" "Ighv4-1" "Ighv3-1" "Gm37961"
## [241] "Ighv11-1" "Ighv14-2" "Ighv4-2" "Ighv3-2" "Ighv11-2"
## [246] "Ighv14-3" "Ighv16-1" "Ighv6-2" "Ighv9-1" "Ighv12-1"
## [251] "Ighv9-2" "Ighv12-2" "Ighv9-3" "Ighv7-3" "Ighv15-1"
## [256] "Ighv14-4" "Ighv3-3" "Ighv7-4" "Ighv3-4" "Ighv3-5"
## [261] "Ighv13-1" "Ighv3-6" "Ighv9-4" "Ighv3-7" "Ighv5-21"
## [266] "Ighv3-8" "Ighv8-1" "Ighv1-1" "Ighv13-2" "Ighv12-3"
## [271] "Ighv6-3" "Ighv6-4" "Ighv6-5" "Ighv6-6" "Ighv6-7"
## [276] "Ighv8-2" "Ighv1-2" "Ighv10-1" "Ighv1-3" "Ighv1-4"
## [281] "Ighv10-2" "Ighv1-5" "Ighv10-3" "Ighv1-6" "Ighv1-7"
## [286] "Ighv10-4" "Ighv1-8" "Ighv15-2" "Ighv1-9" "Ighv1-10"
## [291] "Ighv1-11" "Ighv1-12" "Ighv1-13" "Ighv1-13" "Ighv1-14"
## [296] "Ighv1-15" "Ighv1-16" "Ighv1-17-1" "Ighv1-17" "Ighv1-18"
## [301] "Ighv1-19-1" "Ighv1-19" "Ighv1-20" "Ighv1-21-1" "Ighv1-21"
## [306] "Ighv1-22" "Ighv1-23" "Ighv1-24" "Ighv1-25" "Ighv1-26"
## [311] "Ighv1-27" "Ighv1-28" "Ighv1-29" "Ighv1-30" "Ighv1-31"
## [316] "Ighv1-32" "Ighv1-33" "Ighv1-34" "Ighv1-35" "Ighv1-36"
## [321] "Ighv1-37" "Ighv1-38" "Ighv1-39" "Ighv1-40" "Ighv1-41"
## [326] "Ighv1-42" "Ighv1-43" "Ighv1-44" "Ighv1-45" "Ighv1-46"
## [331] "Ighv1-47" "Ighv8-3" "Ighv8-4" "Ighv1-48" "Ighv1-49"
## [336] "Ighv8-5" "Ighv1-50" "Ighv1-51" "Ighv1-52" "Ighv1-53"
## [341] "Ighv8-6" "Ighv1-54" "Ighv1-55" "Ighv1-56" "Ighv8-7"
## [346] "Ighv1-57" "Ighv8-8" "Ighv1-58" "Ighv1-59" "Ighv1-60"
## [351] "Ighv1-61" "Ighv1-62" "Ighv1-62-1" "Gm37511" "Gm38184"
## [356] "Ighv1-62-2" "Ighv1-62-3" "Ighv8-9" "Ighv1-63" "Ighv1-64"
## [361] "Ighv8-10" "Ighv1-65" "Ighv8-11" "Ighv1-66" "Ighv1-67"
## [366] "Ighv1-68" "Ighv1-69" "Ighv8-12" "Ighv1-70" "Ighv1-71"
## [371] "Ighv1-72" "Ighv8-13" "Ighv1-73" "Ighv1-74" "Ighv8-14"
## [376] "Ighv1-75" "Ighv8-15" "Ighv1-76" "Ighv8-16" "Ighv1-77"
## [381] "Ighv1-78" "Gm42643" "Ighv1-79" "Ighv1-80" "Ighv1-81"
## [386] "Gm42990" "Ighv1-82" "Ighv1-83" "Ighv1-84" "Ighv1-85"
## [391] "Ighv1-86" "Iglc1" "Iglj1" "Iglc3" "Iglj3p"
## [396] "Iglj3" "Iglv1" "Iglc4" "Iglj4" "Iglc2"
## [401] "Iglj2" "Iglv3" "Iglv2" "Gm50428" "Gm50427"
## [406] "Gm50426" "Jchain"
keep <- !rownames(counts.filtered) %in% ig.genes
counts.filtered <- counts.filtered[keep,]
# overwrite mouse.filtered
mouse.filtered <- CreateSeuratObject(counts.filtered,
meta.data = mouse.filtered@meta.data)
# print features removed
print(paste0(dim(counts)[1] - dim(counts.filtered)[1], " features removed"))
## [1] "280 features removed"
# cleanup data
remove(mouse,counts,counts.filtered,nonzero)
# Visualize the number of cell counts per sample
data <- as.data.frame(table(mouse.filtered$sample))
colnames(data) <- c("sample","frequency")
ncells2 <- ggplot(data, aes(x = sample, y = frequency, fill = sample)) +
geom_col() +
theme_classic() +
geom_text(aes(label = frequency),
position=position_dodge(width=0.9),
vjust=-0.25) +
scale_fill_manual(values = sample_colors) +
scale_y_continuous(breaks = seq(0,12000, by = 2000), limits = c(0,12000)) +
ggtitle("Filtered: cells per sample") +
theme(legend.position = "none") +
theme(axis.text.x = element_text(angle = 45, hjust=1))
# Arrange graphs in grid
plots <- list(ncells1,ncells2)
layout <- rbind(c(1),c(2))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# Visualize nCount_RNA
den1 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = nCount_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("nCount_RNA") +
ylab("Density") +
theme(legend.position = "none") +
geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
geom_vline(xintercept = nCount.max, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize nFeature_RNA
den2 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = nFeature_RNA,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("nFeature_RNA") +
ylab("Density") +
geom_vline(xintercept = nFeature.min, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize cell complexity
# Quality cells are usually above 0.85
den3 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = cell.complexity,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("Cell Complexity (log10(nFeature/nCount))") +
ylab("Density") +
geom_vline(xintercept = complexity.cutoff, linetype = "dashed", colour = "red") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.mt
den4 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.mt,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_continuous(n.breaks = 4) +
geom_vline(xintercept = mt.cutoff, linetype = "dashed", colour = "red") +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Mitochondrial Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.ribo.protein
den5 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.ribo.protein,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Ribosomal Protein Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Visualize percent.hb
den6 <- ggplot(mouse.filtered@meta.data,
aes(color = sample,
x = percent.hb,
fill = sample)) +
geom_density(alpha = 0.2) +
theme_classic() +
scale_x_log10() +
geom_vline(xintercept = hb.cutoff, linetype = "dashed", colour = "red") +
scale_color_manual(values = sample_colors) +
theme(legend.position = "none") +
scale_fill_manual(values = sample_colors) +
xlab("% Hemoglobin Genes") +
ylab("Density") +
theme(legend.key.size = unit(0.25, 'cm'), legend.title = element_text(size=9))
# Arrange graphs in grid
plots <- list(den1,den2,den3,den4,den5,den6)
layout <- rbind(c(1,4),c(2,5),c(3,6))
grid <- grid.arrange(grobs = plots, layout_matrix = layout)
# normalize data before violin plots
mouse.filtered <- NormalizeData(mouse.filtered)
## Normalizing layer: counts
# nFeature, nCount, and cell.complexity violins
v3 <- VlnPlot(mouse.filtered,
features = c("nFeature_RNA", "nCount_RNA","cell.complexity"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v3
# percent violins
v4 <- VlnPlot(mouse.filtered,
features = c("percent.mt","percent.ribo.protein","percent.hb"),
ncol = 3,
group.by = 'sample',
cols = sample_colors,
pt.size = 0)
v4
s3 <- ggplot(
mouse.filtered@meta.data,
aes(x = nCount_RNA, y = nFeature_RNA, color = percent.mt)) +
geom_point() +
geom_smooth(method = "lm") +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = nCount.min, linetype = "dashed", colour = "red") +
geom_hline(yintercept = nFeature.min, linetype = "dashed", colour = "red") +
facet_wrap(~sample) +
scale_colour_gradient(low = "gray90", high = "black", limits =c(0,100))
s3
## `geom_smooth()` using formula = 'y ~ x'
s4 <- FeatureScatter(mouse.filtered,
feature1 = "nCount_RNA",
feature2 = "percent.mt",
group.by = 'sample',
cols = sample_colors,
shuffle = TRUE)
s4
# Visualize the distribution of genes detected per cell via boxplot
b1 <- ggplot(mouse.filtered@meta.data,
aes(x = sample,
y = log10(nFeature_RNA),
fill=sample)) +
geom_boxplot() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
ggtitle("Unique Genes / Cell / Sample") +
scale_color_manual(values = sample_colors) +
scale_fill_manual(values = sample_colors) +
xlab("Sample")
b1
df <- data.frame(gene_name = rownames(mouse.filtered))
df$rsum <- rowSums(x = mouse.filtered, slot = "counts")
df <- df[order(df$rsum, decreasing = TRUE),]
rownames(df) <- 1:nrow(df)
head(df, 10)
## gene_name rsum
## 1 Gm42418 9370566
## 2 Malat1 3367384
## 3 Tmsb4x 650153
## 4 Cmss1 616402
## 5 S100a9 597899
## 6 Camk1d 473680
## 7 S100a8 432725
## 8 Cd74 400812
## 9 Actb 363416
## 10 AY036118 350427
markers <- "../../../functions/cell_cycle_markers.tsv"
path <- paste0(out, "unwanted_variation")
mouse.filtered[["phase"]] <- cell_cycle_QC(obj = mouse.filtered,
markersPath = markers,
species = "mouse",
outDir = path)
mouse.filtered[["mito.factor"]] <- mitochondria_QC(obj = mouse.filtered,
outDir = path)
# split
mouse.filtered[["RNA"]] <- split(mouse.filtered[["RNA"]],
f = mouse.filtered$sample)
# transform
mouse.filtered <- SCTransform(mouse.filtered, verbose = FALSE)
# run PCA on the merged object
mouse.filtered <- RunPCA(object = mouse.filtered, assay = "SCT")
## Warning in PrepDR(object = object, features = features, verbose = verbose): The
## following 14 features requested have not been scaled (running reduction without
## them): Rag1, Igll1, Klk1, S100a5, Mcpt8, Prss34, Cldn19, Gm30948, Stoml3,
## Kcnj3, Fut9, Omp, Gm30613, Folr1
## PC_ 1
## Positive: Cd74, Lyz2, Mrc1, H2-Eb1, C1qa, H2-Aa, H2-Ab1, F13a1, Apoe, C1qb
## Slc9a9, Arhgap15, C1qc, Pf4, Slc8a1, Bank1, Pid1, Mctp1, Ptprc, Tyrobp
## S100a9, Rbpj, S100a8, Zeb2, Ccl8, Ctss, Inpp4b, Skap1, Csf1r, Cd52
## Negative: Gpc6, Flt1, Cdh11, Foxp2, Igfbp5, Slc4a10, Ldb2, Ptprg, Bnc2, Ptprb
## Mgp, Pcdh7, Prkg1, Plpp3, Mir100hg, Col12a1, Auts2, Col1a2, Cacna1c, Ptprm
## Plcb1, Cyyr1, Tenm3, Igfbp7, Ptprd, Emcn, Nfib, Cald1, Mecom, Stox2
## PC_ 2
## Positive: Cdh11, Gpc6, Slc4a10, Igfbp5, Foxp2, Bnc2, Col12a1, Col1a2, Mir100hg, Tenm3
## Cacna1c, Ptprd, Slc38a2, Col1a1, Zfhx4, Slit2, Prrx1, Epha3, Mgp, Lama2
## Egfr, Pdzrn3, Slc47a1, Naaladl2, Ank2, Ogn, Tspan11, Map1b, Boc, Igf2
## Negative: Flt1, Ptprb, Ldb2, Ptprg, Plcb1, Cyyr1, Emcn, Kdr, Adgrl4, Igfbp3
## Ptprm, Stox2, Plpp1, Podxl, Mecom, Plvap, Rapgef4, Prex2, Arl15, Col4a1
## Igfbp7, Adgrf5, Dysf, Pecam1, Sema6a, Col4a2, Samd12, Egfl7, Epas1, Hmcn1
## PC_ 3
## Positive: Mrc1, C1qa, Apoe, F13a1, C1qb, Pf4, C1qc, Cd74, Slc9a9, Dab2
## Pid1, Ccl8, Rbpj, H2-Eb1, Stab1, H2-Aa, Selenop, Csf1r, H2-Ab1, Slc8a1
## Ctsb, Rnf150, Ctsc, Cd163, Fcrls, Ms4a7, Maf, Trf, Adgre1, Cst3
## Negative: S100a9, S100a8, Retnlg, Lcn2, Mmp8, Wfdc21, Ngp, S100a6, Pglyrp1, Hp
## Camp, Ltf, Mmp9, Anxa1, Gda, S100a11, Cxcr2, Gsr, Hdc, Ifitm6
## Pygl, Slpi, Cd177, Mxd1, Lyst, Abca13, Chil3, Msrb1, Sell, Csf3r
## PC_ 4
## Positive: S100a9, Lyz2, S100a8, Retnlg, Lcn2, Mmp8, Wfdc21, Ngp, Mrc1, Mmp9
## C1qa, Hp, S100a6, Pglyrp1, Apoe, Camp, F13a1, Gda, Ltf, C1qb
## Anxa1, Ifitm6, Tyrobp, Pf4, Cxcr2, Csf3r, Gsr, Cd177, C1qc, S100a11
## Negative: Kcnq5, Skap1, Inpp4b, Aff3, Pax5, Ikzf3, Bach2, Cd79a, Tmem163, Il7r
## St6galnac3, Ms4a1, Ly6d, Tox, Ebf1, Prkcq, Itk, Il1rl1, Bank1, Bcl11b
## Cacna1e, Cd79b, Bcl2, Gata3, Bcl11a, Grap2, Gm2682, Pde7a, Dach2, Agbl1
## PC_ 5
## Positive: Skap1, Inpp4b, St6galnac3, Il1rl1, Tox, Itk, Prkcq, Il7r, Bcl11b, Dach2
## Gata3, Kcnq5, Gm2682, Tnik, Themis, Camk4, Cd226, Large1, Ikzf2, Ms4a4b
## Plxdc2, Icos, Ptpn13, Cd247, 1700113H08Rik, Pard3, Cxcr6, Rnf128, Chn2, Plcl1
## Negative: Pax5, Aff3, Cd79a, Ebf1, Ms4a1, Tmem163, Bank1, Ly6d, Bach2, Cd79b
## Arhgap24, Bcl11a, Agbl1, Cd74, Cacna1e, Chst3, Vpreb3, Klhl14, Gm30211, Ikzf3
## Btla, Cecr2, Fcmr, Gm15987, Pou2af1, Siglecg, Ralgps2, Ighd, Sox5, Ppm1e
# Reset idents and levels
DefaultAssay(mouse.filtered) <- "SCT"
Idents(mouse.filtered) <- "sample"
mouse.filtered$sample <- factor(mouse.filtered$sample,
levels = sample_order)
mouse.filtered$sex <- factor(mouse.filtered$sex,
levels = sex_order)
mouse.filtered$isoform <- factor(mouse.filtered$isoform,
levels = isoform_order)
# Plot PCA
pca1 <- DimPlot(mouse.filtered,
reduction = "pca",
split.by = "sample",
group.by = "sample",
cols = sample_colors,
ncol = 2)
pca1
pca2 <- DimPlot(mouse.filtered,
reduction = "pca",
split.by = "isoform",
group.by = "isoform",
cols = isoform_colors)
pca2
pca3 <- DimPlot(mouse.filtered,
reduction = "pca",
split.by = "sex",
group.by = "sex",
cols = sex_colors)
pca3
pca4 <- DimPlot(mouse.filtered,
reduction = "pca",
group.by = "sample",
cols = sample_colors,
shuffle = TRUE,
raster = FALSE)
pca4
e1 <- ElbowPlot(mouse.filtered) +
geom_vline(xintercept = 15, linetype = "dashed", colour = "red")
e1
# integrate
mouse.filtered <- IntegrateLayers(mouse.filtered,
method = "HarmonyIntegration",
assay = "SCT",
orig.reduction = "pca")
## Transposing data matrix
## Using automatic lambda estimation
## Initializing state using k-means centroids initialization
## Harmony 1/10
## Harmony 2/10
## Harmony 3/10
## Harmony converged after 3 iterations
# re-join layers
mouse.filtered[["RNA"]] <- JoinLayers(mouse.filtered[["RNA"]])
hrm1 <- DimPlot(mouse.filtered,
reduction = "harmony",
split.by = "sample",
group.by = "sample",
cols = sample_colors,
ncol = 2)
hrm1
hrm2 <- DimPlot(mouse.filtered,
reduction = "harmony",
split.by = "isoform",
group.by = "isoform",
cols = isoform_colors)
hrm2
hrm3 <- DimPlot(mouse.filtered,
reduction = "harmony",
split.by = "sex",
group.by = "sex",
cols = sex_colors)
hrm3
hrm4 <- DimPlot(mouse.filtered,
reduction = "harmony",
group.by = "sample",
cols = sample_colors,
shuffle = TRUE,
raster = FALSE)
hrm4
# Plot elbow
mouse.filtered@reductions$harmony@stdev <-
apply(mouse.filtered@reductions$harmony@cell.embeddings, 2, sd)
e1 <- ElbowPlot(mouse.filtered,
reduction = "harmony") +
geom_vline(xintercept = 15, linetype = "dashed", color = "red")
e1
To overcome the extensive technical noise in the expression of any single gene for scRNA-seq data, Seurat assigns cells to clusters based on their PCA scores derived from the expression of the integrated most variable genes, with each PC essentially representing a “metagene” that combines information across a correlated gene set. Determining how many PCs to include in the clustering step is therefore important to ensure that we are capturing the majority of the variation, or cell types, present in our dataset.
# run UMAP
mouse.filtered <- RunUMAP(mouse.filtered,
dims = 1:15,
reduction = "harmony",
assay = "SCT",
n.components = 3) # set to 3 to use with VR
## 23:07:12 UMAP embedding parameters a = 0.9922 b = 1.112
## 23:07:12 Read 16965 rows and found 15 numeric columns
## 23:07:12 Using Annoy for neighbor search, n_neighbors = 30
## 23:07:12 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 23:07:14 Writing NN index file to temp file /tmp/Rtmp5e1K86/file1c65963711b39
## 23:07:14 Searching Annoy index using 1 thread, search_k = 3000
## 23:07:20 Annoy recall = 100%
## 23:07:21 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 23:07:23 Initializing from normalized Laplacian + noise (using RSpectra)
## 23:07:23 Commencing optimization for 200 epochs, with 670746 positive edges
## 23:07:31 Optimization finished
# plot UMAP
DimPlot(mouse.filtered,
cols = sample_colors,
shuffle = TRUE)
# Determine the K-nearest neighbor graph
mouse.unannotated <- FindNeighbors(object = mouse.filtered,
assay = "SCT", # set as default after SCTransform
reduction = "harmony",
dims = 1:15)
## Computing nearest neighbor graph
## Computing SNN
# Determine the clusters for various resolutions
mouse.unannotated <- FindClusters(object = mouse.unannotated,
algorithm = 1, # 1 = Louvain
resolution = seq(0.1, 0.5, by=0.1))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 16965
## Number of edges: 512676
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9791
## Number of communities: 13
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 16965
## Number of edges: 512676
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9689
## Number of communities: 15
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 16965
## Number of edges: 512676
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9600
## Number of communities: 19
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 16965
## Number of edges: 512676
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9524
## Number of communities: 22
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 16965
## Number of edges: 512676
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9454
## Number of communities: 24
## Elapsed time: 1 seconds
# 0.1
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.1",
label = TRUE)
# 0.2
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.2",
label = TRUE)
# 0.3
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.3",
label = TRUE)
# 0.4
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.4",
label = TRUE)
# 0.5
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.5",
label = TRUE)
mouse.unannotated$seurat_clusters <- mouse.unannotated$SCT_snn_res.0.2
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.2",
label = TRUE)
DimPlot(mouse.unannotated,
group.by = "SCT_snn_res.0.2",
label = TRUE,
dims = c(2,3))